{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "DataAnalysis"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tools import *\n",
    "%matplotlib inline"
   ]
  },
  {
   "source": [
    "# Chap 3 处理原始文本\n",
    "1.  如何访问文件内的文本？\n",
    "2.  如何将文档分割成单独的单词和标点符号，从而进行文本语料上的分析？\n",
    "3.  如何产生格式化的输出，并把结果保存在文件中？"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "## 3.1 从网络和硬盘访问文本"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "### 3.1.1 电子书\n",
    "[Gutenberg Electronic Books Project](http://www.gutenberg.org)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "type(raw)=  <class 'str'>\nlen(raw)=  20497\nraw[:75]=  百家姓\n\n趙錢孫李 周吳鄭王 馮陳褚衛 蔣沈韓楊\n朱秦尤許 何呂施張 孔曹嚴華 金魏陶薑\n戚謝鄒喻 柏水竇章 雲蘇潘葛 奚範彭郎\n魯韋昌馬 \n"
     ]
    }
   ],
   "source": [
    "# 打不开就访问 http://www.gutenberg.org/ 搜索 2554\n",
    "url =\"http://www.gutenberg.org/files/25196/25196-0.txt\" #编号 25196 的文本是《百家姓》\n",
    "\n",
    "from urllib import request\n",
    "\n",
    "response = request.urlopen(url)\n",
    "raw=response.read().decode('utf8')\n",
    "print(\"type(raw)= \", type(raw))\n",
    "print(\"len(raw)= \", len(raw))\n",
    "print(\"raw[:75]= \", raw[600:675])\n",
    "# 网络比较慢，建议换下面的形式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "type(raw)=  <class 'str'>\nlen(raw)=  20497\nraw[:75]=  百家姓\n\n趙錢孫李 周吳鄭王 馮陳褚衛 蔣沈韓楊\n朱秦尤許 何呂施張 孔曹嚴華 金魏陶薑\n戚謝鄒喻 柏水竇章 雲蘇潘葛 奚範彭郎\n魯韋昌馬 \n"
     ]
    }
   ],
   "source": [
    "from nltk.corpus import gutenberg\n",
    "\n",
    "# gutenberg 默认的编码是 latin1\n",
    "raw = gutenberg.raw('25196-0.txt').encode('latin1').decode('utf-8')\n",
    "\n",
    "print(\"type(raw)= \", type(raw))\n",
    "print(\"len(raw)= \", len(raw))\n",
    "print(\"raw[:75]= \", raw[600:675])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "type(tokens)=  <class 'list'>\nlen(tokens)=  3562\ntokens[:10]=  ['\\ufeffThe', 'Project', 'Gutenberg', 'EBook', 'of', 'Bai', 'Jia', 'Xing', ',', 'by']\n"
     ]
    }
   ],
   "source": [
    "# 分词：将字符串分解为词和标点符号\n",
    "from nltk import word_tokenize\n",
    "\n",
    "tokens = nltk.word_tokenize(raw)\n",
    "print(\"type(tokens)= \", type(tokens))\n",
    "print(\"len(tokens)= \", len(tokens))\n",
    "print(\"tokens[:10]= \", tokens[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "type(text)=  <class 'nltk.text.Text'>\ntokens[1020:1060]=  ['keeping', 'this', 'work', 'in', 'the', 'same', 'format', 'with', 'its', 'attached', 'full', 'Project', 'Gutenberg-tm', 'License', 'when', 'you', 'share', 'it', 'without', 'charge', 'with', 'others', '.', '1.D', '.', 'The', 'copyright', 'laws', 'of', 'the', 'place', 'where', 'you', 'are', 'located', 'also', 'govern', 'what', 'you', 'can']\ntext[1020:1060]=  ['keeping', 'this', 'work', 'in', 'the', 'same', 'format', 'with', 'its', 'attached', 'full', 'Project', 'Gutenberg-tm', 'License', 'when', 'you', 'share', 'it', 'without', 'charge', 'with', 'others', '.', '1.D', '.', 'The', 'copyright', 'laws', 'of', 'the', 'place', 'where', 'you', 'are', 'located', 'also', 'govern', 'what', 'you', 'can']\n"
     ]
    }
   ],
   "source": [
    "# 文本切片\n",
    "text = nltk.Text(tokens)\n",
    "print(\"type(text)= \", type(text))\n",
    "print(\"tokens[1020:1060]= \", tokens[1020:1060])\n",
    "print(\"text[1020:1060]= \", text[1020:1060])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Project Gutenberg-tm; Project Gutenberg; Literary Archive; Archive\nFoundation; United States; Gutenberg Literary; electronic works;\nGutenberg-tm electronic; set forth; public domain; electronic work;\nGutenberg-tm License; Bai Jia; Jia Xing; copyright holder; PROJECT\nGUTENBERG; BAI JIA; EBOOK BAI; JIA XING; Plain Vanilla\n"
     ]
    }
   ],
   "source": [
    "# 寻找文档中的固定搭配\n",
    "text.collocations()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "raw.find(趙錢孫李')=  607\nraw.rfind('周吳鄭王')=  612\n"
     ]
    }
   ],
   "source": [
    "# 寻找文档中的关键信息（如：文本名称、作者名称、扫描和校对人名称、许可证信息等等）\n",
    "print(\"raw.find(趙錢孫李')= \",raw.find('趙錢孫李'))\n",
    "print(\"raw.rfind('周吳鄭王')= \", raw.rfind('周吳鄭王'))"
   ]
  },
  {
   "source": [
    "### 3.1.2 处理 HTML文件"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "type(html)=  <class 'bytes'>\nhtml[:60]=  b'<?xml version=\"1.0\" encoding=\"ascii\" ?>\\n\\n<script language=\"j'\ntype(html)=  <class 'str'>\nhtml[:60]=  <?xml version=\"1.0\" encoding=\"ascii\" ?>\n\n<script language=\"j\n"
     ]
    }
   ],
   "source": [
    "url = \"http://news.bbc.co.uk/2/hi/health/2284783.stm\"\n",
    "url = \"https://news.sina.com.cn/c/2019-06-22/doc-ihytcerk8630195.shtml\"\n",
    "url = \"http://www.nltk.org/book/ch03.html\"\n",
    "\n",
    "html = request.urlopen(url).read()\n",
    "print(\"type(html)= \",type(html))\n",
    "print(\"html[:60]= \",html[:60])\n",
    "html = html.decode('utf8')  # bytes 与 str 的转码\n",
    "print(\"type(html)= \",type(html))\n",
    "print(\"html[:60]= \",html[:60])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "tokens[:20]=  ['ch03.rst2', '3', 'Processing', 'Raw', 'Text', 'The', 'most', 'important', 'source', 'of', 'texts', 'is', 'undoubtedly', 'the', 'Web', '.', 'It', \"'s\", 'convenient', 'to']\nraw[:60]=  \n\n\n\n\n\n\nch03.rst2\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
     ]
    }
   ],
   "source": [
    "# 从 HTML 文件中获取文本\n",
    "from bs4 import BeautifulSoup\n",
    "\n",
    "raw = BeautifulSoup(html, 'html.parser').get_text()\n",
    "tokens = word_tokenize(raw)\n",
    "print(\"tokens[:20]= \",tokens[:20])\n",
    "print(\"raw[:60]= \",raw[:60])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Displaying 1 of 1 matches:\nout strings , files , and regular expressions . Since so much text on the web i\nDisplaying 1 of 1 matches:\nur Python knowledge and learn about strings , files , and regular expressions .\nof\nlanguage\nmaterial\n?\nHow\ncan\nwe\nsplit\ndocuments\nup\ninto\nindividual\nwords\nand\n"
     ]
    }
   ],
   "source": [
    "# 定位感兴趣的文本\n",
    "tokens_part = tokens[96:399]\n",
    "text = nltk.Text(tokens_part)\n",
    "text.concordance('expressions')\n",
    "text.concordance('Strings')\n",
    "for i, word in enumerate(text):\n",
    "    if i<=13:\n",
    "        print(word)"
   ]
  },
  {
   "source": [
    "### 3.1.3 处理搜索引擎的结果。\n",
    "\n",
    "-   优点：很容易获得感兴趣的文本\n",
    "-   缺点：数据量大，并且随时会发现改变"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "### 3.1.4 处理 RSS 订阅"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "llog['feed']['title']=  Project Gutenberg Recently Posted or Updated EBooks\nlen(llog.entries)=  19\n"
     ]
    }
   ],
   "source": [
    "import feedparser\n",
    "\n",
    "llog = feedparser.parse('http://www.gutenberg.org/cache/epub/feeds/today.rss')\n",
    "print(\"llog['feed']['title']= \", llog['feed']['title'])\n",
    "print('len(llog.entries)= ', len(llog.entries))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "post=  {'title': 'The Poetical Works of William Lisle Bowles, Vol. 1 by William Lisle Bowles', 'title_detail': {'type': 'text/plain', 'language': None, 'base': 'http://www.gutenberg.org/cache/epub/feeds/today.rss', 'value': 'The Poetical Works of William Lisle Bowles, Vol. 1 by William Lisle Bowles'}, 'links': [{'rel': 'alternate', 'type': 'text/html', 'href': 'https://www.gutenberg.org/ebooks/18915'}], 'link': 'https://www.gutenberg.org/ebooks/18915', 'summary': 'Language: English', 'summary_detail': {'type': 'text/html', 'language': None, 'base': 'http://www.gutenberg.org/cache/epub/feeds/today.rss', 'value': 'Language: English'}}\npost.title=  The Poetical Works of William Lisle Bowles, Vol. 1 by William Lisle Bowles\ntitle_detail=  {'type': 'text/plain', 'language': None, 'base': 'http://www.gutenberg.org/cache/epub/feeds/today.rss', 'value': 'The Poetical Works of William Lisle Bowles, Vol. 1 by William Lisle Bowles'}\nvalue=  The Poetical Works of William Lisle Bowles, Vol. 1 by William Lisle Bowles\n"
     ]
    }
   ],
   "source": [
    "post = llog.entries[2]\n",
    "print(\"post= \", post)\n",
    "print(\"post.title= \", post.title)\n",
    "\n",
    "title_detail = post.title_detail\n",
    "print(\"title_detail= \", title_detail)\n",
    "\n",
    "value= title_detail.value\n",
    "print(\"value= \", value)"
   ]
  },
  {
   "source": [
    "### 3.1.5 读取本地文件"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "type(raw)=  <class 'str'>\nraw[100:150]=  ram\nAccad\nAchbor\nAdah\nAdam\nAdbeel\nAdmah\nAdullamite\n"
     ]
    }
   ],
   "source": [
    "f = open('PythonNLP/dict.files/output.txt')\n",
    "raw = f.read()\n",
    "print(\"type(raw)= \", type(raw))\n",
    "print(\"raw[100:150]= \",raw[100:150])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "--------------- >strip line< ---------------\n!\n'\n(\n)\n,\n,)\n.\n.)\n:\n;\n;)\n"
     ]
    }
   ],
   "source": [
    "show_subtitle(\"strip line\")\n",
    "f=open('PythonNLP/dict.files/output.txt', 'rU')\n",
    "for i, line in enumerate(f):\n",
    "    if i<=10:\n",
    "        print(line, end='')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "type(raw)=  <class 'str'>\nraw[100:150]=   School)\n\nThe pale Usher--threadbare in coat, hear\n"
     ]
    }
   ],
   "source": [
    "# 直接读取NLTK中的语料库文件\n",
    "# 'r' 表示以只读方式打开文件；'U' 表示「通用」，忽略不同的换行符公约\n",
    "path = nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')\n",
    "raw = open(path, 'rU').read()\n",
    "print(\"type(raw)= \", type(raw))\n",
    "print(\"raw[100:150]= \",raw[100:150])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "['machine_learning_note',\n",
       " 'NLTK-Python-CN',\n",
       " 'SiBert_tensorflow',\n",
       " 'StudyNotes-CN']"
      ]
     },
     "metadata": {},
     "execution_count": 16
    }
   ],
   "source": [
    "# 看看当前目录下面还有啥文件？\n",
    "import os\n",
    "\n",
    "os.listdir('..')"
   ]
  },
  {
   "source": [
    "### 3.1.6 从 PDF、MS Word 及其他二进制格式中提取文本"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "### 3.1.7 捕获用户输入"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "type(tokens)=  <class 'list'>\nYou typed 3 words.\ntype(text)=  <class 'nltk.text.Text'>\ntype(words)=  <class 'list'>\ntype(vocab)=  <class 'list'>\n"
     ]
    }
   ],
   "source": [
    "sent = input('Enter somme text: ')\n",
    "tokens = word_tokenize(sent)\n",
    "print(\"type(tokens)= \",type(tokens))\n",
    "print('You typed', len(tokens), 'words.')\n",
    "text = nltk.Text(tokens)\n",
    "print(\"type(text)= \",type(text))\n",
    "words = [w.lower() for w in tokens]\n",
    "print(\"type(words)= \",type(words))\n",
    "vocab = sorted(set(words))\n",
    "print(\"type(vocab)= \",type(vocab))"
   ]
  },
  {
   "source": [
    "### 3.1.8 NLP 的流程\n",
    "NLP处理流程：\n",
    "-   打开一个URL，读里面HTML 格式的内容，去除标记，并选择字符的切片，\n",
    "-   然后分词，是否转换为nltk.Text 对象是可选择的。\n",
    "-   也可以将所有词汇小写并提取词汇表。\n",
    "\n",
    "注：中文需要提供分词功能"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "--------------- >html[:20]< ---------------\n<!doctype html>\n<htm\n--------------- >raw[:20]< ---------------\n知乎 - 有问题，上知乎免密码登录密码登\n--------------- >tokens[:20]< ---------------\n['知乎', '-', '有问题，上知乎免密码登录密码登录获取短信验证码接收语音验证码注册/登录未注册手机验证后自动登录，注册即代表同意《知乎协议》《隐私保护指引》社交帐号登录微信QQ微博\\u200b开通机构号\\u200b下载知乎', 'App知乎专栏圆桌发现移动应用联系我们来知乎工作注册机构号©', '2020', '知乎京', 'ICP', '证', '110745', '号京', 'ICP', '备', '13052560', '号', '-', '1京公网安备', '11010802010035', '号出版物经营许可证侵权举报网上有害信息举报专区儿童色情信息举报专区违法和不良信息举报：010-82716601']\n--------------- >text[:20]< ---------------\n['知乎', '-', '有问题，上知乎免密码登录密码登录获取短信验证码接收语音验证码注册/登录未注册手机验证后自动登录，注册即代表同意《知乎协议》《隐私保护指引》社交帐号登录微信QQ微博\\u200b开通机构号\\u200b下载知乎', 'App知乎专栏圆桌发现移动应用联系我们来知乎工作注册机构号©', '2020', '知乎京', 'ICP', '证', '110745', '号京', 'ICP', '备', '13052560', '号', '-', '1京公网安备', '11010802010035', '号出版物经营许可证侵权举报网上有害信息举报专区儿童色情信息举报专区违法和不良信息举报：010-82716601']\n--------------- >words[:20]< ---------------\n['知乎', '-', '有问题，上知乎免密码登录密码登录获取短信验证码接收语音验证码注册/登录未注册手机验证后自动登录，注册即代表同意《知乎协议》《隐私保护指引》社交帐号登录微信qq微博\\u200b开通机构号\\u200b下载知乎', 'app知乎专栏圆桌发现移动应用联系我们来知乎工作注册机构号©', '2020', '知乎京', 'icp', '证', '110745', '号京', 'icp', '备', '13052560', '号', '-', '1京公网安备', '11010802010035', '号出版物经营许可证侵权举报网上有害信息举报专区儿童色情信息举报专区违法和不良信息举报：010-82716601']\n--------------- >vocab[:20]< ---------------\n['-', '11010802010035', '110745', '13052560', '1京公网安备', '2020', 'app知乎专栏圆桌发现移动应用联系我们来知乎工作注册机构号©', 'icp', '号', '号京', '号出版物经营许可证侵权举报网上有害信息举报专区儿童色情信息举报专区违法和不良信息举报：010-82716601', '备', '有问题，上知乎免密码登录密码登录获取短信验证码接收语音验证码注册/登录未注册手机验证后自动登录，注册即代表同意《知乎协议》《隐私保护指引》社交帐号登录微信qq微博\\u200b开通机构号\\u200b下载知乎', '知乎', '知乎京', '证']\n"
     ]
    }
   ],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "\n",
    "url = \"https://www.zhihu.com/\"\n",
    "html = request.urlopen(url).read().decode('utf8')\n",
    "raw = BeautifulSoup(html).get_text()\n",
    "raw = raw[:500]\n",
    "tokens = word_tokenize(raw)\n",
    "tokens = tokens[:390]\n",
    "text = nltk.Text(tokens)\n",
    "words = [w.lower() for w in text]\n",
    "vocab = sorted(set(words))\n",
    "show_subtitle(\"html[:20]\")\n",
    "print(html[:20])\n",
    "show_subtitle(\"raw[:20]\")\n",
    "print(raw[:20])\n",
    "show_subtitle(\"tokens[:20]\")\n",
    "print(tokens[:20])\n",
    "show_subtitle(\"text[:20]\")\n",
    "print(text[:20])\n",
    "show_subtitle(\"words[:20]\")\n",
    "print(words[:20])\n",
    "show_subtitle(\"vocab[:20]\")\n",
    "print(vocab[:20])"
   ]
  }
 ]
}